rm(list = ls())
library(stm)
library(stringr)
library(tm)

# the SenatePosts.csv file is omitted from this replication file for copyright reasons

### clean data
# Load the raw posts; text columns stay character rather than factor.
Senate <- read.csv('SenatePosts.csv', header = TRUE, stringsAsFactors = FALSE)
# Keep the first 7 characters of the timestamp, strip the dashes and store the
# result as a number (assumes post_time starts with "YYYY-MM" -- TODO confirm).
Senate$post_time <- as.numeric(gsub('-', '', substring(Senate$post_time, 1, 7)))
# Keep posts whose leading four digits (the year) are 2008 or later.
Senate <- Senate[substring(Senate$post_time, 1, 4) >= 2008,]
# Strip the sequence "?", byte 0x80, "?" from the text
# (presumably an encoding artefact -- verify against the raw data).
Senate$post_text <- gsub("\\?\x80\\?", "", Senate$post_text)
# Drop auto-generated posts. A logical mask is used instead of `-grep(...)`:
# when nothing matches, grep() returns integer(0) and `x[-integer(0), ]`
# silently removes EVERY row rather than none.
Senate <- Senate[!grepl("new photo", Senate$post_text),]
Senate <- Senate[!grepl("added a life event", Senate$post_text),]
# Number of space-separated tokens per post; lengths() is type-stable even
# when the data frame is empty, unlike sapply(..., length).
Senate$word_count <- lengths(strsplit(Senate$post_text, " "))
# Keep posts with at least 10 tokens.
Senate <- Senate[Senate$word_count >= 10,] #163698 obs

# remove spanish
# Posts are flagged as Spanish by counting how many entries of a list of the
# most common Spanish words (Sketch Engine,
# https://www.sketchengine.eu/spanish-word-list/) occur in each post.
spanish_dict <- read.csv('spanish-word-list-total.csv',
                         header = TRUE, stringsAsFactors = FALSE, sep = ";")
# Rows 4 to 503 of the second column form the 500-entry word list.
spanish_words <- spanish_dict[4:503, 2]
# remove the words also existing in English
# NOTE(review): "genral" looks like a typo for "general"; as written, "general"
# is NOT excluded and may still count as a Spanish hit. Left unchanged here
# because correcting it would change the replicated sample -- confirm intent.
english_overlap <- c("a", "con", "no", "me", "santa", "favor", "sin", "era", "genral",
                     "mayor", "sea", "solo", "social", "primer", "tan", "fin", "embargo",
                     "final", "argentina", "sector", "personal", "director", "total", "van",
                     "control", "local", "chile", "base", "municipal", "plan", "cargo",
                     "capital", "actual", "idea", "red", "federal", "internet", "real",
                     "principal", "central", "web", "soy", "superior", "crisis", "etc.")
spanish_words <- spanish_words[!spanish_words %in% english_overlap]
# Match whole words only, case-insensitively. The regex object is built once
# instead of once per post.
spanish_words <- paste0("\\b", spanish_words, "\\b")
spanish_regex <- regex(spanish_words, ignore_case = TRUE)
# For each post: the number of distinct dictionary words that appear in it.
# USE.NAMES = FALSE avoids attaching every full post text as an element name.
suspect <- vapply(Senate$post_text,
                  function(x) sum(str_detect(x, spanish_regex)),
                  numeric(1), USE.NAMES = FALSE)
Senate$span_count <- suspect
table(Senate$span_count)
# Share of the post's tokens accounted for by matched dictionary words.
Senate$span_rate <- Senate$span_count / Senate$word_count
# A post is treated as Spanish when it has many hits, a high hit rate, or a
# moderate amount of both. Computed once and reused for inspection and removal.
is_spanish <- (Senate$span_count >= 4 & Senate$span_rate >= 0.2) |
  (Senate$span_count >= 10) |
  (Senate$span_rate >= 0.3)
# Manual inspection only; skipped in non-interactive runs where View() is not
# usable. Columns are selected by position (3, 5, 6, 7).
if (interactive()) {
  View(Senate[Senate$span_count >= 5, c(3, 5, 6, 7)])
  View(Senate[is_spanish, c(3, 5, 6, 7)])
}
Senate <- Senate[!is_spanish,] #163642

### fit topic models
# step 1: pre-process, get the stm object
# Custom stop words: chamber/platform terms, filler words, lower-cased state
# names and abbreviations, plus second words of some multi-word state names.
cust_stop <- c(
  'senator', 'senate', 'sen',
  'facebook', 'please', 'today', 'will',
  tolower(state.name), tolower(state.abb),
  'hampshire', 'jersey', 'york', 'carolina', 'dakota', 'rhode'
)
# Tokenise and clean the post text; the full data frame travels along as
# metadata. No stemming; words outside 3-20 characters are discarded.
docs <- textProcessor(
  documents = as.character(Senate$post_text),
  metadata = Senate,
  language = "en",
  lowercase = TRUE,
  striphtml = TRUE,
  removestopwords = TRUE,
  removenumbers = TRUE,
  removepunctuation = TRUE,
  customstopwords = cust_stop,
  stem = FALSE,
  wordLengths = c(3, 20)
)
# Build the final corpus (documents, vocabulary, aligned metadata) and store it.
stmPrep <- prepDocuments(docs$documents, docs$vocab, meta = docs$meta)
save(stmPrep, file = '../1_topicvalidation/models/stmPrep.Rdata')
# Create the held-out version of the corpus with a fixed seed and store it.
heldout <- make.heldout(stmPrep$documents, stmPrep$vocab, seed = 806)
save(heldout, file = '../1_topicvalidation/models/heldout.Rdata')
# The models below are fitted on the held-out version of the corpus.
documents <- heldout[["documents"]]
vocab <- heldout[["vocab"]]

# step 2: fit stm with different k
# Each model is written to disk right after fitting, so a later failure does
# not lose the earlier fits.

# 10 topics, spectral initialisation, stopped after a single EM iteration
stm10k1it <- stm(documents = documents, vocab = vocab, K = 10,
                 init.type = "Spectral", max.em.its = 1)
save(stm10k1it, file = '../1_topicvalidation/models/stm10k1it.Rdata')

# 10 topics, default settings
stm10k <- stm(documents = documents, vocab = vocab, K = 10)
save(stm10k, file = '../1_topicvalidation/models/stm10k.Rdata')

# 50 topics, default settings
stm50k <- stm(documents = documents, vocab = vocab, K = 50)
save(stm50k, file = '../1_topicvalidation/models/stm50k.Rdata')

# 100 topics, default settings
stm100k <- stm(documents = documents, vocab = vocab, K = 100)
save(stm100k, file = '../1_topicvalidation/models/stm100k.Rdata')

# 500 topics, default settings
stm500k <- stm(documents = documents, vocab = vocab, K = 500)
save(stm500k, file = '../1_topicvalidation/models/stm500k.Rdata')

# inspect the stm results: the 20 top words per topic for every fitted model
labelTopics(model = stm10k1it, n = 20)
labelTopics(model = stm10k, n = 20)
labelTopics(model = stm50k, n = 20)
labelTopics(model = stm100k, n = 20)
labelTopics(model = stm500k, n = 20)

### predict topics for each document for the label validation tasks
# Reload the 100-topic model, the prepared corpus and the held-out split.
load('../1_topicvalidation/models/stm100k.Rdata')
load('../1_topicvalidation/models/stmPrep.Rdata')
load('../1_topicvalidation/models/heldout.Rdata')

# For every document (row), the topic indices sorted from the largest to the
# smallest theta value.
ordertheta <- t(apply(stm100k$theta, 1, order, decreasing = TRUE))

# Exclude the documents listed in the held-out index, then pair the first four
# metadata columns with each remaining document's three top-ranked topics.
held_idx <- heldout$missing$index
meta_kept <- stmPrep$meta[-held_idx, 1:4]
rank_kept <- ordertheta[-held_idx, 1:3]
textrank <- cbind(meta_kept, rank_kept)
colnames(textrank)[5:7] <- c("top1", "top2", "top3")

# Topic indices selected for validation (hard-coded for this model fit).
domTopics <- c(1, 9, 21, 35, 44, 62, 65, 70, 88, 89)
top1_in_dom <- textrank$top1 %in% domTopics
top2_in_dom <- textrank$top2 %in% domTopics
top3_in_dom <- textrank$top3 %in% domTopics

# pool for optimal label: the top-ranked topic is one of the selected topics
top1domText <- textrank[top1_in_dom,]
write.csv(top1domText,
          "../2_labelvalidation/textpool/top1domText.csv",
          row.names = FALSE)

# pool for label intrusion: all three top-ranked topics are selected topics
top3domText <- textrank[top1_in_dom & top2_in_dom & top3_in_dom,]
write.csv(top3domText,
          "../2_labelvalidation/textpool/top3domText.csv",
          row.names = FALSE)

